{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d946c3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import ndjson\n",
    "import os\n",
    "import json\n",
    "import random\n",
    "import math\n",
    "import re\n",
    "from functools import partial\n",
    "from tqdm import tqdm\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "318021ac",
   "metadata": {},
   "source": [
    "# The Stack Code"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "67bb9ddb",
   "metadata": {},
   "source": [
    "## Full Dataset Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bbd1a41b",
   "metadata": {},
   "outputs": [],
   "source": [
    "cumsize = 0\n",
    "cumtokens = 0\n",
    "with open(\"../meta_json/stats.json\") as f: \n",
    "    stats = json.load(f)\n",
    "    \n",
    "for key in stats:\n",
    "    print(key.upper())\n",
    "    tokens = stats[key][\"tokens\"]/10**9\n",
    "    cumtokens += tokens\n",
    "    print(f\"tokens: {tokens:.4f} B\")\n",
    "    size = stats[key][\"size\"]/10**9\n",
    "    cumsize += size\n",
    "    print(f\"size: {size:.4f} GB\\n\")\n",
    "\n",
    "print(\"CUMULATIVE:\")\n",
    "print(f\"tokens: {cumtokens:.4f} B\")\n",
    "print(f\"size: {cumsize:.4f} GB\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e52e126f",
   "metadata": {},
   "outputs": [],
   "source": [
    "pairs = [(key.title(), stats[key][\"tokens\"]) for key in stats]\n",
    "\n",
    "pairs = sorted(pairs, key = lambda x: -x[1])\n",
    "\n",
    "plt.bar([x[0] for x in pairs], [x[1] for x in pairs])\n",
    "plt.ylabel('Tokens')\n",
    "# plt.yscale('log')\n",
    "plt.xticks(rotation=-90)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d9f91f4c",
   "metadata": {},
   "source": [
    "## Defining generic data analysis utilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "654adc08",
   "metadata": {},
   "outputs": [],
   "source": [
    "def numerical_density(ex):\n",
    "    # The ratio of digit characters over non-digit characters in the file\n",
    "    txt = ''.join(ex[\"text\"].split())\n",
    "    ntoks = sum(txt.count(c) for c in \"0123456789\")\n",
    "    return ntoks / len(txt)\n",
    "\n",
    "def print_ex(example): \n",
    "    text = example[\"text\"]\n",
    "    print(f\"numeric density: {numerical_density(example)}\")\n",
    "    print(f\"length (characters): {len(text)}\")\n",
    "    print(example[\"meta\"][\"max_stars_repo_name\"])\n",
    "    print(example[\"meta\"][\"max_stars_repo_path\"] + \"\\n\" + \"#\"*40 + \"\\n\")\n",
    "    print(text)\n",
    "    \n",
    "class Printer: \n",
    "    def __init__(self, data):\n",
    "        self.data = data\n",
    "        self.index = 0 \n",
    "        self.rindex = len(data)-1\n",
    "    def print_head(self): \n",
    "        print(f\"index: {self.index}\")\n",
    "        print_ex(self.data[self.index])\n",
    "        self.index += 1\n",
    "    def print_tail(self): \n",
    "        print(f\"index: {self.rindex}\")\n",
    "        print_ex(self.data[self.rindex])\n",
    "        self.rindex -= 1"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3f112af5",
   "metadata": {},
   "source": [
    "## Per-language analysis\n",
    "Workflow for manually inspecting the quality of samples in a particular language. \n",
    "\n",
    "The stack has a ton of data quality issues so this is important"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d6f2598c",
   "metadata": {},
   "outputs": [],
   "source": [
    "lang = \"r\"\n",
    "shard = \"0000\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dddc7dde",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(f\"../data_jsonl/train/{lang}{shard}.jsonl\") as f: \n",
    "    ds = ndjson.load(f)\n",
    "\n",
    "print(\"len: \", len(ds))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "94b7b04b",
   "metadata": {},
   "source": [
    "### Random Sample Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85d7e66e",
   "metadata": {},
   "outputs": [],
   "source": [
    "shuffle_ds = ds\n",
    "random.shuffle(ds)\n",
    "shuffle_printer = Printer(shuffle_ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fc9be8b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "shuffle_printer.print_head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2759ad8a",
   "metadata": {},
   "source": [
    "### Length analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f61c4ee6",
   "metadata": {},
   "outputs": [],
   "source": [
    "length_ds = sorted(ds, key= lambda x: len(x[\"text\"]))\n",
    "length_printer = Printer(length_ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d366a88",
   "metadata": {},
   "outputs": [],
   "source": [
    "length_printer.print_head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1fcf7d68",
   "metadata": {},
   "outputs": [],
   "source": [
    "length_printer.print_tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "00ee3010",
   "metadata": {},
   "source": [
    "### Numerical density analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de1cc344",
   "metadata": {},
   "outputs": [],
   "source": [
    "num_ds = sorted(ds, key= lambda x: numerical_density(x))\n",
    "num_printer = Printer(num_ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa8ace28",
   "metadata": {},
   "outputs": [],
   "source": [
    "num_printer.print_head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f65dbf72",
   "metadata": {},
   "outputs": [],
   "source": [
    "num_printer.rindex += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae4e2eb7",
   "metadata": {},
   "outputs": [],
   "source": [
    "num_printer.print_tail()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
